Overview

The objective of this analysis is twofold:

  • Creating a regression model with high prediction accuracy (using adjusted R-squared while minimizing AIC), then using the residuals from the model to identify neighborhoods that could be considered undervalued. The goal of this model is precision, and it’s not focused on the identifying the extent to which individual features are influential.
  • Creating a simpler model that does account for multicolinearity, so as to get a better sense of which individual features are most influential.

Load libraries

library(dplyr)
library(data.table)
library(leaflet)
library(car)
library(RColorBrewer)
library(geojsonio)
library(sp)
library(pedometrics)
library(leafsync)

Load data

knitr::opts_knit$set(root.dir = normalizePath("../analysis/data/"))
path <- knitr::opts_knit$get("root.dir")
df <- read.csv(paste0(path, '/final_data.csv'))

Preview the data

head(df[order(df$num_props),])
##           GEOID num_props median_value pct_owned BEDROOMS    BATHS BLDG_AREA
## 192 55079120300         2       139950 0.5000000 3.500000 1.000000  3361.500
## 206 55079186900         4       863450 0.5000000 5.250000 3.750000 10741.000
## 193 55079120400         5       138700 1.0000000 3.200000 1.400000  3025.400
## 190 55079090200         6       225150 0.8333333 3.500000 1.833333  4618.500
## 191 55079110100        15        95000 0.8000000 2.800000 1.066667  2508.800
## 110 55079011300        21       168900 0.5238095 2.809524 1.333333  2818.619
##      LOT_AREA AIR_CONDITIONING    garage      vacant pct_residential
## 192  7130.500        1.0000000 1.0000000 0.000000000       1.0000000
## 206 11215.000        0.5000000 0.5000000 0.000000000       0.8384432
## 193  5400.000        0.8000000 1.0000000 0.000000000       1.0000000
## 190  8688.333        1.0000000 1.0000000 0.000000000       1.0000000
## 191  4981.733        0.5333333 0.7333333 0.000000000       0.4128271
## 110  2886.238        0.2857143 0.4285714 0.004587156       0.5751411
##     pct_single_fam median_hh_income population pct_white pct_black pct_hisp
## 192     0.47738408            60000       2102      75.1       3.3     20.0
## 206     0.02374724            53345       2353      84.8       3.2      1.9
## 193     0.16097691            55365       6810      73.5       4.7     14.9
## 190     1.00000000            82368       1926      78.9       8.5      2.3
## 191     0.24015316            35286       4200      48.7      17.3     29.9
## 110     0.01850200            43077       2052      80.3       9.9      4.8
##     pct_asian pct_coll_plus   ALAND pop_density Homicide other_violent_crime
## 192       0.0          29.4  885487      23.738        0                   0
## 206       8.1          72.3  427986      54.978        0                  16
## 193       2.5          23.3 4795004      14.202        0                  12
## 190       8.5          57.9 3817170       5.046        0                   0
## 191       2.9          19.6 2897237      14.497        0                   5
## 110       3.1          66.8  433646      47.320        1                 105
##     property_crime violent_crime_trend
## 192              1                   0
## 206            193                  -2
## 193             12                  -4
## 190              1                   0
## 191             20                  -1
## 110            678                   8
### Four of these tracts have a single digit number of properties, and including
### some of them in initial models (particularly the outlier wiht a median value
### of $863K) was influencing the model, so let's remove them
df <- df[df$num_props > 10,]

Exploring relationships with median value

### visualize the relationships to get a better sense of the relationship of 
### each variable with median value
par(mfrow=c(8, 3), mar=c(2,2,2,2))
y <- df$median_value

for (i in 4:26)
{
  title <- colnames(df[i])
  plot(df[,i], y, main=title, xlab=title, ylab='median value')
  abline(lm(y~df[,i]), col='red')
}

### In looking at the above plots, vacancy, homicide and violent crime have a clear,
### negative relationship with home values, but only up to a certain point, so we'll
### experiment with creating log versions of those fields
## Because the log transformations can create Inf values, we'll first set fields with
## values of zero to be negligibly positive
df$vacant[df$vacant== 0] <- .0001
df$vacLog <- log(df$vacant)

### because homicides and "other" violent crime are closely correlated and have very
### similar relationships with median home value, let's simplify this by combining
### them into one field
df$violentCrime <- df$Homicide + df$other_violent_crime
df$violentCrime[df$violentCrime== 0] <- .0001
df$violentCrimeLog <- log(df$violentCrime)


## Do the log versions of these fields have a stronger relationship with home values? Yes.
sprintf("vacancy: %s; vacLog: %s", cor(df$vacant, df$median_value)[1],
        cor(df$vacLog, df$median_value)[1])
## [1] "vacancy: -0.540622789490693; vacLog: -0.710353939500846"
sprintf("violent crime: %s; log violent crime: %s", cor(df$violentCrime, df$median_value)[1],
        cor(df$violentCrimeLog, df$median_value)[1])
## [1] "violent crime: -0.548009059130518; log violent crime: -0.584100423375757"
### let's look at correlations bewteen median value and potential features
cors <- sort(cor(df[3:29])[,1], decreasing=TRUE)
cors
##        median_value       pct_coll_plus           pct_white               BATHS 
##          1.00000000          0.79627323          0.73345942          0.71283848 
##           BLDG_AREA    median_hh_income           pct_owned    AIR_CONDITIONING 
##          0.70649176          0.69266722          0.63483912          0.47054985 
##              garage            LOT_AREA            BEDROOMS          population 
##          0.31229880          0.30921344          0.24793719          0.18172322 
##     pct_residential               ALAND      pct_single_fam         pop_density 
##          0.15833838          0.14995737          0.11452208          0.02989415 
##           pct_asian violent_crime_trend            pct_hisp      property_crime 
##         -0.03525563         -0.05985659         -0.11785635         -0.13047943 
##            Homicide              vacant           pct_black other_violent_crime 
##         -0.48931972         -0.54062279         -0.54642153         -0.54694953 
##        violentCrime     violentCrimeLog              vacLog 
##         -0.54800906         -0.58410042         -0.71035394

After testing a variety of combinations of different terms and their interactions, below is my initial model (focused on prediction accuracy)

mod1 <- lm(median_value~BLDG_AREA+BATHS+pct_coll_plus+pct_white
                +vacLog+pct_black+garage+BEDROOMS+property_crime+I(BATHS*BLDG_AREA)
                +I(BATHS*BEDROOMS)+I(BLDG_AREA*BEDROOMS)
                +I(pct_black*pct_coll_plus)+I(pct_black*vacLog)
                +I(pct_white*property_crime)+violentCrimeLog
                +I(pct_black*violentCrimeLog)
                , data = df)

summary(mod1)
## 
## Call:
## lm(formula = median_value ~ BLDG_AREA + BATHS + pct_coll_plus + 
##     pct_white + vacLog + pct_black + garage + BEDROOMS + property_crime + 
##     I(BATHS * BLDG_AREA) + I(BATHS * BEDROOMS) + I(BLDG_AREA * 
##     BEDROOMS) + I(pct_black * pct_coll_plus) + I(pct_black * 
##     vacLog) + I(pct_white * property_crime) + violentCrimeLog + 
##     I(pct_black * violentCrimeLog), data = df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -47578  -7286    758   7498  36792 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    -1.604e+05  5.286e+04  -3.033 0.002765 ** 
## BLDG_AREA                      -4.259e-01  2.848e+01  -0.015 0.988085    
## BATHS                          -2.281e+04  6.496e+04  -0.351 0.725853    
## pct_coll_plus                   1.317e+03  1.723e+02   7.647 1.09e-12 ***
## pct_white                      -1.581e+01  1.505e+02  -0.105 0.916419    
## vacLog                         -2.173e+03  1.859e+03  -1.169 0.243939    
## pct_black                      -2.001e+03  6.234e+02  -3.210 0.001564 ** 
## garage                          2.114e+04  7.721e+03   2.738 0.006779 ** 
## BEDROOMS                        1.346e+05  2.797e+04   4.812 3.09e-06 ***
## property_crime                 -2.576e+01  1.206e+01  -2.136 0.033992 *  
## I(BATHS * BLDG_AREA)            9.937e+01  1.310e+01   7.587 1.55e-12 ***
## I(BATHS * BEDROOMS)            -7.266e+04  2.576e+04  -2.821 0.005312 ** 
## I(BLDG_AREA * BEDROOMS)        -2.388e+01  6.283e+00  -3.801 0.000196 ***
## I(pct_black * pct_coll_plus)   -1.984e+01  4.414e+00  -4.495 1.22e-05 ***
## I(pct_black * vacLog)          -1.510e+02  3.966e+01  -3.809 0.000190 ***
## I(pct_white * property_crime)   8.165e-01  2.407e-01   3.392 0.000848 ***
## violentCrimeLog                -4.442e+03  3.805e+03  -1.167 0.244622    
## I(pct_black * violentCrimeLog)  2.256e+02  9.057e+01   2.491 0.013611 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13900 on 185 degrees of freedom
## Multiple R-squared:  0.9654, Adjusted R-squared:  0.9623 
## F-statistic:   304 on 17 and 185 DF,  p-value: < 2.2e-16
sprintf('AIC: %s', AIC(mod1))
## [1] "AIC: 4468.41899606828"
### The residuals are normally distributed, and it doesn't look like
### any of the observations are too influential
hist(resid(mod1))

par(mfrow=c(2,2))
plot(mod1)

### add predicted values and residuals to the df so that we can visualize them
df$preds = round(predict(mod1), 0)
df$residuals = round(resid(mod1), 0)

Visualize residuals

### navigate to geo folder
knitr::opts_knit$set(root.dir = normalizePath('../../geo/'))
# knitr::opts_knit$set(root.dir = normalizePath('../geo/'))
geo_path <- knitr::opts_knit$get("root.dir")

tracts <- geojson_read(paste0(geo_path, '/mke_county_tract.geojson'), what = "sp")

### merge df and geo obj, inner join only
tracts_df <- merge(tracts, df, by='GEOID', all=FALSE)
 
m1Bins <- c(min(tracts_df$residuals), -35000, -25000, -15000, -5000, 5000,
          15000, 25000, 35000, max(tracts_df$residuals))
m1Pal <- colorBin('RdYlGn', domain = tracts_df$residuals, bins = m1Bins, reverse = TRUE)

### Create labels for hovering
m1Labels <- sprintf(
          '<strong> %s</strong><br/>Expected value: <strong>$%g</strong><br/>
          Actual value: <strong>$%g</strong><br/>Difference: <strong>$%g</strong>',
          tracts_df$NAMELSAD, tracts_df$preds,
          tracts_df$median_value, tracts_df$residuals
) %>% lapply(htmltools::HTML)

m1 <- leaflet(tracts_df) %>%
  setView(lat=43.060174, lng=-87.925549, zoom = 11) %>%
  addTiles() %>%
  addPolygons(
    fillColor = ~m1Pal(tracts_df$residuals),
    weight = 1,
    color = 'white',
    opacity = 0.75,
    fillOpacity = 0.75,
    ### add hover-over capability
    highlight = highlightOptions(
    weight = 5,
    color = "#666",
    fillOpacity = 0.7,
    bringToFront = TRUE),
  ### adding labels for hover tool
  label = m1Labels,
  labelOptions = labelOptions(
    style = list("font-weight" = "normal", padding = "3px 8px"),
    textsize = "11px",
    direction = "auto")
  ) %>%
  ### add legend
  addLegend(pal = m1Pal, values = ~density, opacity = 0.7,
            title = 'Expected minus<br>Actual Value',
            position = "topright")

### map for actual values only
m2Bins <- c(min(tracts_df$median_value), 30000, 60000, 90000, 120000, 150000,
          180000, 210000, 240000, max(tracts_df$median_value))
m2Ppal <- colorBin('Blues', domain = tracts_df$median_value, bins = m2Bins)

### Create labels for hovering
m2Labels <- sprintf(
          '<strong> %s</strong><br/>Median value: <strong>$%g</strong>',
          tracts_df$NAMELSAD,tracts_df$median_value
        ) %>% lapply(htmltools::HTML)

m2 <- leaflet(tracts_df) %>%
  setView(lat=43.060174, lng=-87.925549, zoom = 11) %>%
  addTiles() %>%
  addPolygons(
    fillColor = ~m2Ppal(tracts_df$median_value),
    weight = 1,
    color = 'white',
    opacity = 0.75,
    fillOpacity = 0.75,
    ### add hover-over capability
    highlight = highlightOptions(
    weight = 5,
    color = "#666",
    fillOpacity = 0.7,
    bringToFront = TRUE),
  ### adding labels for hover tool
  label = m2Labels,
  labelOptions = labelOptions(
    style = list("font-weight" = "normal", padding = "3px 8px"),
    textsize = "11px",
    direction = "auto")
  ) %>%
  ### add legend
  addLegend(pal = m2Ppal, values = ~density, opacity = 0.7,
            title = 'Median value',
            position = "topright")

In the below plots, we can see that one neighborhood on the west side of the city really stands out for having an actual median value that is nearly $50K below what the model expects the value to be. Hover over neighborhoods for details.

latticeview(m1, m2, ncol=2)

Model #2 (Simpler appraoch focused on understanding influence of individual variables)

### To be able to make comparisons between different variables, first scale the data
### (excluding GEOID and median value)
scaled <- as.data.frame(scale(subset(df, select=-c(GEOID, median_value))))

### join back in geo and median val cols
scaled <- cbind(subset(df, select=c(GEOID, median_value)), scaled)

### final simple model
mod2 <- lm(median_value~BATHS+median_hh_income+pct_black+
                      pct_coll_plus+property_crime+vacLog,
                      data = scaled)

summary(mod2)
## 
## Call:
## lm(formula = median_value ~ BATHS + median_hh_income + pct_black + 
##     pct_coll_plus + property_crime + vacLog, data = scaled)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -57609 -13137   2045  10896  54836 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        106690       1493  71.461  < 2e-16 ***
## BATHS               34478       1786  19.300  < 2e-16 ***
## median_hh_income    13187       2339   5.637 5.96e-08 ***
## pct_black          -10347       2020  -5.122 7.20e-07 ***
## pct_coll_plus       15422       2343   6.583 4.13e-10 ***
## property_crime       5196       1576   3.297  0.00116 ** 
## vacLog             -20373       2182  -9.338  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 21270 on 196 degrees of freedom
## Multiple R-squared:  0.9143, Adjusted R-squared:  0.9117 
## F-statistic: 348.5 on 6 and 196 DF,  p-value: < 2.2e-16
### And we can see that multicolinearity is fairly low
vif(mod2)
##            BATHS median_hh_income        pct_black    pct_coll_plus 
##         1.424669         2.442891         1.821536         2.450280 
##   property_crime           vacLog 
##         1.108642         2.125138
## residuals are normally distributed, though there are a few on the low end that stand out
hist(resid(mod2))

## no obvservations are too skewed or are having too much influence over the model
par(mfrow = c(2,2))
plot(mod2)

In the below plot, which shows the influence of each variable in the model, we can see that the average number of bathrooms – which it’s worth noting is strongly correlated with average home size, lot size and the average number of bedrooms – has the greatest influence on average home values in a neighborhood. The model also finds that even when controlling for several other factors, neighborhoods with larger black populations have lower property values, though the influence of that variable is smaller than most other variables. And while it’s hard to imagine that other fields, such as say, violent crime, don’t have a significant effect on property values, the effect of them is difficult to interpret because adding them quickly produces high multicolinearity within the model.

par(mar = c(4,6,1,1))
barplot(sort(coef(mod2)[2:7]), xlab = 'Coefficients',
        horiz = T, las=2, col='blue',
        names.arg=c("Vacancy", "% Black", "Prop Crime", "Income",
        "% Coll. deg.", "Baths"))
abline(v=0)